knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
# Load libraries
library(caret)
library(tidyverse)
library(cowplot)
library(ggplot2)
library(ggpattern)
library(ggsci)
library(utils)
library(reshape2)
library(stringr)
library(gridGraphics)
library(zellkonverter)
library(SingleCellExperiment)
library(cytomapper)
library(scater)
library(ggpubr)
library(dittoSeq)
# Set general input paths to all analysis files
analysis.path <- "/mnt/bb_dqbm_volume/analysis"
# Set path to masks
tonsil.path <- "/mnt/bb_dqbm_volume/data/Tonsil_th152"
lung.path <- "/mnt/bb_dqbm_volume/data/Immucan_lung"
# Import helper fncs
source("../analysis/helpers/helper_fnc.R")
first, we look at U computed using different training datasets. For this, we read in the results from CISI training on the Tonsil th152 and the Immucan lung dataset (tonsil_vs_lung.py).
## Read results
# Read in all results for tonsil and lung comparison into one dataframe
files <- list.files(analysis.path, 'simulation_results.txt',
full.names=TRUE, recursive=TRUE)
# TODO: change this line if more files come along and need to exclude
files <- files[grepl(
"tonsil_vs_lung|Tonsil_th152/training/full|Immucan_lung/training/subset", files)]
res <- lapply(files, read_results, type="res") %>%
bind_rows()
# Get information about best U files
u_files <- res %>%
dplyr::filter(dataset==training) %>%
dplyr::select(k, training, `Best crossvalidation fold`, datasize) %>%
distinct()
# Harmonize all dataset names
patterns <- unique(unlist(lapply(res$training, function(name){
if(length(str_split(name, "_")[[1]])==1){
name
}
})))
res$training <- unlist(lapply(res$training, function(pat){
patterns[str_detect(pat, fixed(patterns, ignore_case=TRUE))]
}))
res$dataset <- unlist(lapply(res$dataset, function(pat){
patterns[str_detect(pat, fixed(patterns, ignore_case=TRUE))]
}))
## Read U
u <- lapply(1:(nrow(u_files)), function(i){
file <- file.path(analysis.path, u_files[i, "training"], "training",
u_files[i, "datasize"],
paste0("k_", u_files[i, "k"]),
paste0("crossvalidation_", u_files[i, "Best crossvalidation fold"]),
"gene_modules.csv")
u_temp <- read_U(file, "training")
}) %>% bind_rows() %>%
dplyr::rename(dataset=rep)
## Read X_test and X_simulated
# Specify X_test and X_simulated files to be read in
X.files <- list.files(analysis.path, "X_", full.names=TRUE, recursive=TRUE)
# TODO: change this line if more files come along and need to exclude
X.files <- X.files[grepl(
"tonsil_vs_lung|Tonsil_th152/training/full|Immucan_lung/training/subset", X.files)]
X.files <- X.files[!grepl("_processed", X.files)]
# Read in all sce experiments from saved anndata and save additional info in metadata
# (e.g. which dataset is used in training and testing, which k was used and if it
# is the ground truth or simulated X)
# Remove neg. values and transform counts?
X.list <- lapply(X.files, read_results, type="x")
X.list <- lapply(X.list, function(sce.temp){
pat <- metadata(sce.temp)
metadata(sce.temp)$dataset <- patterns[str_detect(pat$dataset,
fixed(patterns, ignore_case=TRUE))]
metadata(sce.temp)$training <- patterns[str_detect(pat$training,
fixed(patterns, ignore_case=TRUE))]
assay.name <- names(assays(sce.temp))
for (a in assay.name[-1]) {
assay(sce.temp, a) <- NULL
}
assay(sce.temp, "exprs") <- asinh(counts(sce.temp)/1)
sce.temp
})
# Add reduced dimensions
set.seed(220225)
X.list <- lapply(X.list, function(sce){
sce <- runUMAP(sce, exprs_values="exprs")
sce <- runTSNE(sce, exprs_values="exprs")
sce
})
# Save SCE to avoid computing UMAP, TSNE, ... all the time since it is computationally
# expensive
# lapply(1:(length(X.list)), function(i){saveRDS(X.list[i],
# paste0(gsub("\\..*", "", X.files[i]),
# "_processed.",
# gsub(".*\\.", "", basename(X.files[i]))))})
## Read masks
# Read in masks for tonsil data
masks.tonsil <- loadImages(file.path(tonsil.path, "steinbock/masks_deepcell"), as.is = TRUE)
mcols(masks.tonsil) <- DataFrame(sample_id=names(masks.tonsil))
# Read in masks for lung data
masks.lung <- loadImages(file.path(lung.path, "masks"), as.is = TRUE)
mcols(masks.lung) <- DataFrame(sample_id=names(masks.lung))
## Read random forest classifier
# Read RF classifier for the lung dataset
rffit.lung <- readRDS(file.path(lung.path, "rffit_v7_all_markers.rds"))
# For each different measurement of training results, plot a barplot comparing
# diff. k-sparsities, simulation types and training-test set combinations
for (measure in names(res)[2:10]) {
cat('##### ', measure, '\n')
# Reorder dataframe for plotting
data_temp <- res %>%
dplyr::select(dataset, training, simulation, k, !!measure) %>%
mutate(group=paste(training, dataset, sep="_"))
# Create barplot
barplot <- ggplot(data_temp, aes(x=group, y=!!sym(measure), fill=k, pattern=simulation)) +
geom_bar_pattern(stat="identity",
position=position_dodge(preserve="single"),
width=0.6,
color="black",
pattern_fill="black",
pattern_angle=45,
pattern_density=0.3,
pattern_spacing=0.01,
pattern_key_scale_factor=0.6) +
scale_y_continuous(limits=c(0, 1)) +
scale_fill_npg() +
scale_pattern_manual(values=c(composite="stripe", noisy="none"),
labels=c("Composite", "Noisy")) +
labs(x="Training_Test Dataset", y=str_to_title(measure), pattern="Simulation Type") +
guides(pattern=guide_legend(override.aes=list(fill="white")),
fill=guide_legend(override.aes=list(pattern="none"))) +
theme_cowplot()
print(barplot)
cat("\n\n")
}
cor <- plot_U(u, "k", "dataset")
plot_U_membership(u, "k", "dataset")
#### Mantel Test between U’s
# Calculate mantel test between U's of tonsil and lung for all k's
mantel <- lapply(cor, function(l){
mantel_test(l[[1]], l[[2]])$mantel_r
}) %>%
as.data.frame(col.names=names(cor), check.names=FALSE)
knitr::kable(mantel, digits=2,
col.names=paste0("k = ", names(mantel)))
| k = 1 | k = 3 |
|---|---|
| 0.67 | 0.42 |
# Subset to training and test of tonsil data
X.plotList <- keep(X.list, function(x){
metadata(x)$training=="tonsil" & metadata(x)$dataset=="tonsil" & metadata(x)$k=="1"})
names(X.plotList) <- lapply(X.plotList,
function(x){metadata(x)$ground_truth})
# Call plot_cells to get individual plots for each roi for decomposed and true
# results
X.imgList <- plot_cells(X.plotList, masks.tonsil,
"CD20")
# Plot decomposed vs true results for test roi (002)
X.img <- plot_grid(plotlist=append(X.imgList[grepl("20220520_TsH_th152_cisi1_002",
names(X.imgList))],
X.imgList[grepl("legend",
names(X.imgList))]),
ncol=2, labels=names(X.plotList),
label_size=15, hjust=c(-2, -1.5),
vjust=1, scale=0.9)
X.img
# Rename list of tonsil data for nicer titles in plotting
X.plotListRenamed <- lapply(names(X.plotList), function(n){
rownames(X.plotList[[n]]) <- paste(rownames(X.plotList[[n]]),
n, sep="\n")
reducedDims(X.plotList[[n]]) <- NULL
X.plotList[[n]]
})
names(X.plotListRenamed) <- names(X.plotList)
# Add decomposed and true dataset of tonsil into one SCE
X.overlayed <- do.call("rbind", X.plotListRenamed)
# Define protein of interest
pois <- c("CD20\nsimulated", "CD20\nground_truth")
# Plot cells coloured according to decomposed and true poi values
# (if similar in both should have a mixture of colours, e.g. orange)
X.imgDiff <- plotCells(mask=masks.tonsil[unique(colData(X.overlayed)$sample_id)],
object=X.overlayed,
cell_id="ObjectNumber", img_id="sample_id", colour_by=pois,
return_plot=TRUE, image_title=list(cex=1),
scale_bar=list(cex=1, lwidth=5),
legend=list(colour_by.title.cex=0.7, colour_by.labels.cex=0.7))
ggdraw(X.imgDiff$plot)
Plot of arcsinh transformed counts coloured by defined celltype.
# Subset to dataset trained and tested on the Immucan lung dataset
X.exprsList <- keep(X.list, function(x){
metadata(x)$training=="lung" & metadata(x)$dataset=="lung" & metadata(x)$k=="1"})
names(X.exprsList) <- lapply(X.exprsList,
function(x){metadata(x)$ground_truth})
# Plot asinh transformed counts of proteins of interest of decomposed and true
# datasets coloured by different celltypes
X.Exprs <- plot_grid(plot_exprs(X.exprsList, "B", "CD3", "CD20"),
plot_exprs(X.exprsList, "BnT", "CD3", "CD20"),
plot_exprs(X.exprsList, "Neutro", "MPO", "CD15"),
ncol=1)
print(X.Exprs)
Next, we have a look of normalization in training.
# # TODO: Check that this works and figure out what exactly we want to look at here
#
# # Subset to dataset trained and tested on the Immucan lung dataset with k=1
# # with and without normalization for the decomposed counts
# X.normList <- keep(X.list, function(x){
# metadata(x)$training=="lung" & metadata(x)$dataset=="lung" &
# (metadata(x)$k=="1" | metadata(x)$k=="norm") &
# metadata(x)$ground_truth=="simulated"})
# names(X.normList) <- lapply(X.normList,
# function(x){metadata(x)$k})
#
# # Plot asinh transformed counts of proteins of interest of decomposed data trained
# # with and without normalization
# X.Norms <- plot_grid(plot_exprs(X.exprsList, "", "CD3", "CD20"),
# plot_exprs(X.exprsList, "", "CD3", "VISTA"),
# ncol=1)
# print(X.Norms)
Scatterplot of ground truth vs decomposed results per protein for k=1 and asinh transformed counts.
# Add all X_test counts together into dataframe for easier plotting
aoi <- "exprs"
X.df <- lapply(X.list, function(sce){
counts.long <- as.data.frame(assays(sce)[[aoi]]) %>%
mutate(protein=rownames(.)) %>%
melt() %>%
dplyr::rename(!!as.symbol(metadata(sce)$ground_truth):=value,
cell=variable) %>%
mutate(k=metadata(sce)$k,
dataset=paste(metadata(sce)$training, metadata(sce)$dataset, sep="_"))
}) %>% bind_rows() %>%
group_by(protein, cell, k, dataset) %>%
summarise_all(na.omit)
# For each test/training dataset combination plot for each true vs decomposed
# results in scatter plot and add diagonal and regression line, as well as
# R (pearson correlation coefficient)
for (i in unique(X.df$dataset)) {
cat('#####', i, '\n')
proteinPlot <- ggscatter(X.df %>%
dplyr::filter(k=="1" & dataset==i),
x="ground_truth", y="simulated",
add="reg.line",
color=pal_npg("nrc")("1"),
add.params=list(color=pal_npg("nrc")("4")[4],
size=2)) +
facet_wrap(~ protein, scales="free", ncol=5) +
theme_cowplot(10) +
geom_abline(slope=1, linetype="dashed") +
stat_cor(size=2)
print(proteinPlot)
cat('\n\n')
}
Correlation between ground truth and decomposed results per protein for k=1 and asinh transformed counts.
# Calculate correlations between ground truth and simulated data for each protein
X.cor <- X.df %>%
group_by(k, dataset, protein) %>%
summarize(correlation=cor(ground_truth, simulated),
median=median(ground_truth),
max=max(ground_truth),
var=var(ground_truth))
# Plot correlations for k=1 for each test/training dataset combination
for (i in unique(X.df$dataset)) {
cat('#####', i, '\n')
proteinCorr <- ggplot(X.cor %>%
dplyr::filter(k=="1" & dataset==i),
aes(x=protein, y=correlation, fill=protein)) +
geom_bar(stat="identity") +
theme_cowplot() +
scale_fill_igv() +
guides(fill=FALSE) +
coord_flip()
print(proteinCorr)
cat('\n\n')
}
Correlation between ground truth and decomposed results per protein for k=1 and asinh transformed counts compared to protein characteristics of gorund truth: Median, max and variance.
# Define charachteristics of interest
coi <- c("median", "max", "var")
# Plot correlations for k=1 for each test/training dataset combination
for (i in coi) {
cat('#####', i, '\n')
proteinChar <- ggscatter(X.cor %>%
dplyr::filter(k=="1"),
x="correlation", y=i,
add="reg.line",
color=pal_npg("nrc")("1"),
add.params=list(color=pal_npg("nrc")("4")[4],
size=2)) +
facet_wrap(~ dataset, scales="free", ncol=2) +
theme_cowplot(10) +
stat_cor(size=2)
print(proteinChar)
cat('\n\n')
}
# Calculate correlations between all ground truth cells and for each cell retain
# its max correlation to another cell
# If max >= 0.995, then the cell is marked as having a highly correlated cell in
# ground truth
sce.maxCor <- counts(X.exprsList[[2]]) %>%
cor %>%
as.data.frame %>%
rownames_to_column(var='cell_i') %>%
gather(key="cell_j", value="cor", -cell_i) %>%
dplyr::filter(!(cell_i==cell_j)) %>%
group_by(cell_i) %>%
summarise(max_cor=max(cor, na.rm=TRUE)) %>%
dplyr::filter(max_cor>=0.995)
# Add above information to sce experiments
colData(X.exprsList[[1]])$high_cor <- ifelse(
colData(X.exprsList[[1]])$cell_id %in% sce.maxCor$cell_i,
"high",
"low")
colData(X.exprsList[[2]])$high_cor <- colData(X.exprsList[[1]])$high_cor
# Create empty list for plots
X.redDimlist <- list
# Extract for simulated and true results for UMAP and TSNE and plot and
# colour highly correlated ground truth cells
for(sce in X.exprsList){
for(method in c("UMAP", "TSNE")){
X.redDimlist <- append(X.redDimlist,
list(dittoDimPlot(sce, var="high_cor",
reduction.use=method, size=0.2) +
scale_color_manual(values=c("high"="red", "low"="grey"),
name="Highly Correlated to Another Cell",
labels=c("high"="true", "low"="false")) +
theme_cowplot() +
theme(legend.position="bottom") +
guides(color=guide_legend(nrow=1))))
}
}
# Remove empty plot at the beginning and add legend to the end of the plot list
X.redDimlist <- c(lapply(X.redDimlist[-1], function(p){
p + theme(legend.position="none")
}), list(get_legend(X.redDimlist[2])))
# Plot simulated and true reduced dim on top of each other
X.redDimPlot <- plot_grid(plotlist=X.redDimlist,
ncol=2, labels=rep(names(X.exprsList), each=2),
label_size=15, hjust=c(-2, -1.5), vjust=1, scale=0.9)
print(X.redDimPlot)
#### Cell Phenotyping
# # Subset to dataset trained and tested on the Immucan lung dataset
# lungSim.list <- keep(X.list, function(x){
# metadata(x)$dataset=="lung" & metadata(x)$k=="1" & metadata(x)$ground_truth=="simulated"})
# names(lungSim.list) <- lapply(lungSim.list,
# function(x){metadata(x)$training})
# lungSim.exprs <- lapply(lungSim.list, function(sce){
# cur_mat <- t(assay(sce, "exprs"))
# # Predict cell phenotypes in test data
# cur_pred <- predict(rffit.lung,
# newdata=cur_mat)
# cur_pred
# })
#
# cm <- confusionMatrix(data=cur_pred,
# reference=factor(lungSim.list$cell_labels),
# mode="everything")
#
# cm